Candace Savonen - CCDL for ALSF
This notebook is a first pass in evaluating the concordance between MuTect2 and Strelka2 results.
It addresses issue # 30 in OpenPBTA.
* Some of these set up steps will be removed once we have a Dockerfile that installs maftools automatically.
# We need maftools - this will be added to the running Docker issue whenever it is up
if (!('maftools' %in% installed.packages())) {
devtools::install_github("PoisonAlien/maftools")
}
if (!('hexbin' %in% installed.packages())) {
install.packages("hexbin")
}
if (!('colorblindr' %in% installed.packages())) {
devtools::install_github("clauswilke/colorblindr")
}
# Get magrittr pipe
`%>%` <- dplyr::`%>%`
Create a output directories.
if (!dir.exists("results")) {
dir.create("results")
}
if (!dir.exists("plots")) {
dir.create("plots")
}
This will be changed once the data download script is finalized.
Prep the metadata to be used as the clinicalData for maftools it it hasn’t been prepped yet. This whole chunk may need to be removed be taken out after issue 2 regarding the data download script is addressed.
These steps are only to make the metadata and MuTect2 and Strelka2 datasets parallel in what samples they contain.
There are some samples in the metadata that are not in the MuTect2 and Strelka2 data.
if (file.exists(file.path("..", "..", "scratch", "metadata_filtered.tsv"))) {
metadata <- readr::read_tsv(file.path("..", "..", "scratch", "metadata_filtered.tsv"))
}
Will read in as an maftools object from an RDS file, unless maftools has not been run on them yet. Running maftools takes a lot of computing power and time, so be prepared. If you trying to run this step in a Docker container, it will likely be out of memory killed, unless you have ~50GB you can allot to Docker.
# Load in the RDS file if it exists, but if it doesn't exist, load in from
# original data file with maftools
if (!file.exists(file.path("..", "..", "scratch", "strelka2.RDS"))) {
strelka <- maftools::read.maf(file.path("..", "..", "data", "strelka2.maf.gz"),
clinicalData = metadata)
saveRDS(strelka, file.path("..", "..", "scratch", "strelka2.RDS"))
} else {
strelka2 <- readRDS(file.path("..", "..", "scratch", "strelka2.RDS"))
}
# Same for MuTect2
if (!file.exists(file.path("..", "..", "scratch", "mutect2.RDS"))) {
mutect2 <- maftools::read.maf(file.path("..", "..", "data", "mutect2.maf.gz"),
clinicalData = metadata)
saveRDS(mutect2, file.path("..", "..", "scratch", "mutect2.RDS"))
} else {
mutect2 <- readRDS(file.path("..", "..", "scratch", "mutect2.RDS"))
}
This is how I set up metadata_filtered.tsv. This should not need to be ran again and won’t run again unless metadata_filtered.tsv is misplaced from the scratch file.
if (!file.exists(file.path("..", "..", "scratch", "metadata_filtered.tsv"))) {
# Isolate metadata to only the samples that are in the datasets
metadata <- metadata %>%
dplyr::filter(sample_id %in% mutect2@clinical.data$Tumor_Sample_Barcode) %>%
dplyr::distinct(sample_id, .keep_all = TRUE) %>%
readr::write_tsv(file.path("..", "..", "scratch", "metadata_filtered.tsv"))
}
Check that samples are same order in the datasets as they are in the metadata
all.equal(as.factor(metadata$sample_id),
strelka2@clinical.data$Tumor_Sample_Barcode)
all.equal(as.factor(metadata$sample_id),
mutect2@clinical.data$Tumor_Sample_Barcode)
Get gene summaries and write to TSV files.
strelka2_gene_sum <- maftools::getGeneSummary(strelka2) %>%
readr::write_tsv(file.path("results",
"strelka2_gene_summary.tsv"))
mutect2_gene_sum <- maftools::getGeneSummary(mutect2) %>%
readr::write_tsv(file.path("results",
"mutect2_gene_summary.tsv"))
Get sample summaries and write to TSV files.
strelka2_sample_sum <- maftools::getSampleSummary(strelka2) %>%
readr::write_tsv(file.path("results",
"strelka2_sample_summary.tsv"))
mutect2_sample_sum <- maftools::getSampleSummary(mutect2) %>%
readr::write_tsv(file.path("results",
"mutect2_sample_summary.tsv"))
combined_gene <- mutect2_gene_sum %>%
dplyr::full_join(strelka2_gene_sum, by = 'Hugo_Symbol') %>%
reshape2::melt(id = 'Hugo_Symbol') %>%
dplyr::mutate(dataset = as.character(grepl(".x$", variable))) %>%
dplyr::mutate(dataset = dplyr::recode(dataset,
`TRUE` = "mutect2",
`FALSE` = "strelka2")) %>%
dplyr::mutate(variable = gsub(".x$|.y$", "", variable)) %>%
tidyr::spread('dataset', 'value')
Let’s get a correlation test on the genes overall.
cor.test(combined_gene$mutect2, combined_gene$strelka2, method = "spearman")
cor.test(combined_gene$mutect2, combined_gene$strelka2, method = "pearson")
combined_sample <- mutect2_sample_sum %>%
dplyr::full_join(strelka2_sample_sum, by = 'Tumor_Sample_Barcode') %>%
reshape2::melt(id = 'Tumor_Sample_Barcode') %>%
dplyr::mutate(dataset = as.character(grepl(".x$", variable))) %>%
dplyr::mutate(dataset = dplyr::recode(dataset,
`TRUE` = "mutect2",
`FALSE` = "strelka2")) %>%
dplyr::mutate(variable = gsub(".x$|.y$", "", variable)) %>%
tidyr::spread('dataset', 'value')
Let’s get a correlation test on the genes overall.
cor.test(combined_sample$mutect2, combined_sample$strelka2, method = "spearman")
cor.test(combined_sample$mutect2, combined_sample$strelka2, method = "pearson")
Here we will make these new variables for both Mutect2 and Strelka2 dataset: - Calculate VAF for each - Make a mutation ID by concatenating gene name, allele, tumor ID, and start position - Summarize the biotype variable for whether or not it is a coding gene.
Let’s do this for Strelka2 first.
strelka2_vaf <- strelka2@data %>%
dplyr::mutate(vaf = as.numeric(t_alt_count)/(as.numeric(t_ref_count) +
as.numeric(t_alt_count)),
mutation = paste0(Hugo_Symbol, "_",
Allele, "_",
Tumor_Sample_Barcode, "_",
Start_Position),
change = paste0(Reference_Allele, ">", Allele),
coding = dplyr::case_when(
BIOTYPE != "protein_coding" ~ "non-coding",
TRUE ~ "protein_coding")) %>%
dplyr::select(-which(apply(is.na(.), 2, all)))
NAs introduced by coercion
# Take a look at this df
strelka2_vaf
Now we will do the same for MuTect2.
mutect2_vaf <- mutect2@data %>%
dplyr::mutate(vaf = as.numeric(t_alt_count)/(as.numeric(t_ref_count) +
as.numeric(t_alt_count)),
mutation = paste0(Hugo_Symbol, "_",
Allele, "_",
Tumor_Sample_Barcode, "_",
Start_Position),
change = paste0(Reference_Allele, ">", Allele),
coding = dplyr::case_when(
BIOTYPE != "protein_coding" ~ "non-coding",
TRUE ~ "protein_coding")) %>%
dplyr::select(-which(apply(is.na(.), 2, all)))
# Take a look at this df
mutect2_vaf
Save to a TSV file
# Merge these data.frames together
vaf_df <- strelka2_vaf %>%
dplyr::full_join(mutect2_vaf, by = 'mutation',
suffix = c(".strelka2", ".mutect2")) %>%
# Make a variable that denotes which dataset it is in.
dplyr::mutate(dataset = dplyr::case_when(
is.na(vaf.mutect2) ~ "strelka2_only",
is.na(vaf.strelka2) ~ "mutect2_only",
TRUE ~ "both")) %>%
readr::write_tsv(file.path("results", "combined_results.tsv"))
Session Info:
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] maftools_2.0.15 Biobase_2.44.0 BiocGenerics_0.30.0
loaded via a namespace (and not attached):
[1] pkgload_1.0.2 tidyr_0.8.3 jsonlite_1.6 splines_3.6.1
[5] foreach_1.4.7 assertthat_0.2.1 yaml_2.2.0 remotes_2.1.0
[9] sessioninfo_1.1.1 pillar_1.4.2 backports_1.1.4 lattice_0.20-38
[13] glue_1.3.1 digest_0.6.20 RColorBrewer_1.1-2 colorspace_1.4-1
[17] htmltools_0.3.6 Matrix_1.2-17 plyr_1.8.4 pkgconfig_2.0.2
[21] devtools_2.1.0 bibtex_0.4.2 purrr_0.3.2 xtable_1.8-4
[25] scales_1.0.0 processx_3.4.1 VennDiagram_1.6.20 tibble_2.1.3
[29] pkgmaker_0.27 ggplot2_3.2.0 usethis_1.5.1 withr_2.1.2
[33] hexbin_1.27.3 lazyeval_0.2.2 cli_1.1.0 survival_2.44-1.1
[37] magrittr_1.5 crayon_1.3.4 evaluate_0.14 memoise_1.1.0
[41] ps_1.3.0 fs_1.3.1 doParallel_1.0.14 NMF_0.21.0
[45] pkgbuild_1.0.3 tools_3.6.1 registry_0.5-1 data.table_1.12.2
[49] prettyunits_1.0.2 hms_0.5.0 formatR_1.7 gridBase_0.4-7
[53] stringr_1.4.0 munsell_0.5.0 cluster_2.1.0 rngtools_1.4
[57] lambda.r_1.2.3 callr_3.3.1 compiler_3.6.1 rlang_0.4.0
[61] futile.logger_1.4.3 grid_3.6.1 iterators_1.0.12 rstudioapi_0.10
[65] base64enc_0.1-3 rmarkdown_1.14 colorblindr_0.1.0 labeling_0.3
[69] testthat_2.2.1 gtable_0.3.0 codetools_0.2-16 reshape2_1.4.3
[73] R6_2.4.0 knitr_1.23 dplyr_0.8.3 zeallot_0.1.0
[77] rprojroot_1.3-2 futile.options_1.0.1 readr_1.3.1 desc_1.2.0
[81] stringi_1.4.3 Rcpp_1.0.2 vctrs_0.2.0 wordcloud_2.6
[85] tidyselect_0.2.5 xfun_0.8